library(MASS)
library(haven)
library(Matrix)
library(lfe)
library(ggplot2)
library(RColorBrewer)
library(Hotelling)
library(did)

source("functions.R")

### run data_cleaning.R first
load("data_cleaned.RData")

### cross-validation for regularized time fixed-effects
# Table 1 of SA
CrossValidation <- list(eps=0,
                        maxiter=200,
                        n_of_initial_values=5000)
CrossValidation$Constant <- function(t){
  ans <- 1
  return(ans)
} # 1 variables

CrossValidation$Constant2 <- function(t){
  ans <- c(1,(t>6))
  return(ans)
} # 2 variables

CrossValidation$Constant3 <- function(t){
  ans <- c(1,(t>4),(t>7))
  return(ans)
} # 3 variables

CrossValidation$LinSpline <- function(t){
  ans <- c(1,t-1)
  return(ans)
} # 2 variables

CrossValidation$LinSpline2 <- function(t){
  ans <- c(1,t-1,(t-6)*(t>6))
  return(ans)
} # 3 variables


# Use pretreatment outcomes only
SubData <- KmeansData[KmeansData$time<=DataCleaning$T0 & 
                        KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
SubData <- as.matrix(SubData)

# drop last three time periods for cross-validation
CrossValidation$result <- matrix(0,5,4)

for (TestTime in (DataCleaning$T0-2):DataCleaning$T0){
  TrainData <- SubData[SubData[,3]<TestTime,]
  TestData <- SubData[SubData[,3]==TestTime,]
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$Constant)
  CrossValidation$result[1,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$Constant(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$Constant2)
  CrossValidation$result[2,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$Constant2(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$Constant3)
  CrossValidation$result[3,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$Constant3(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$LinSpline)
  CrossValidation$result[4,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$LinSpline(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$LinSpline2)
  CrossValidation$result[5,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$LinSpline2(TestData[i,3]))))^2
    )
}
remove(temp,TestTime,TrainData,TestData)
CrossValidation$result[,4] <- apply(CrossValidation$result[,1:3],1,mean)
CrossValidation$result



### K-means clustering using entire time series
# Table 11 of SA
CrossValidationNT <- list(eps=0,
                          maxiter=200,
                          n_of_initial_values=5000)

CrossValidationNT$Constant <- function(t){
  ans <- 1
  return(ans)
} # 1 variables

CrossValidationNT$Constant2 <- function(t){
  ans <- c(1,(t>10))
  return(ans)
} # 2 variables

CrossValidationNT$Constant3 <- function(t){
  ans <- c(1,(t>7),(t>13))
  return(ans)
} # 3 variables

CrossValidationNT$LinSpline <- function(t){
  ans <- c(1,t-1)
  return(ans)
} # 2 variables

CrossValidationNT$LinSpline2 <- function(t){
  ans <- c(1,t-1,(t-10)*(t>10))
  return(ans)
} # 3 variables

# use never-treated units only
SubData <- KmeansData[KmeansData$E>T & KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
temp <- unique(SubData$id)
SubData$id <- sapply(SubData$id,function(x) which(x==temp))
remove(temp)
SubData <- as.matrix(SubData)

# drop last three time periods for cross-validation
CrossValidationNT$result <- matrix(0,5,4)

for (TestTime in (T-2):T){
  TrainData <- SubData[SubData[,3]<TestTime,]
  TestData <- SubData[SubData[,3]==TestTime,]
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$Constant)
  CrossValidationNT$result[1,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$Constant(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$Constant2)
  CrossValidationNT$result[2,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$Constant2(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$Constant3)
  CrossValidationNT$result[3,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$Constant3(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$LinSpline)
  CrossValidationNT$result[4,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$LinSpline(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$LinSpline2)
  CrossValidationNT$result[5,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$LinSpline2(TestData[i,3]))))^2
    )
}
remove(temp,TestTime,TrainData,TestData)
CrossValidationNT$result[,4] <- apply(CrossValidationNT$result[,1:3],1,mean)
CrossValidationNT$result